In [1]:
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
from prettyprint import pp
import os, re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.grid_search import GridSearchCV
from datetime import datetime as dt
from ipy_table import *
%matplotlib inline
In [2]:
root_path = 'E:/University Central/Modern Information Retrieval/Project/Project Phase 2/20_newsgroup/'
# top-level folders, one per class
folders = [root_path + folder + '/' for folder in os.listdir(root_path)]
#there are only 4 classes
class_titles = os.listdir(root_path)
#list of all the files belonging to each class
files = {}
for folder, title in zip(folders, class_titles):
    files[title] = [folder + f for f in os.listdir(folder)]
In [3]:
train_test_ratio = 0.75
def train_test_split(ratio, classes, files):
    """
    this method will split the input lists of files into train and test sets.
    *Note: currently this method uses the simplest way an array can be split in two parts.

    Parameters
    ----------
    ratio: float
        ratio of total documents in each class assigned to the training set
    classes: list
        list of label classes
    files: dictionary
        a dictionary with the list of files for each class

    Returns
    -------
    train_dict: dictionary
        a dictionary with lists of documents in the training set for each class
    test_dict: dictionary
        a dictionary with lists of documents in the testing set for each class
    """
    train_dict = {}
    test_dict = {}
    for cl in classes:
        train_cnt = int(ratio * len(files[cl]))
        train_dict[cl] = files[cl][:train_cnt]
        test_dict[cl] = files[cl][train_cnt:]
    return train_dict, test_dict
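As the note in the docstring says, this split just slices the head of each file list, so whatever order os.listdir returns carries into the split. A sketch of a shuffled variant (the fixed seed is an arbitrary illustrative choice, kept only for reproducibility):

import random

def shuffled_train_test_split(ratio, classes, files, seed=0):
    # same contract as train_test_split above, but shuffles a copy of each
    # class's file list before slicing, so the split is order-independent
    rng = random.Random(seed)
    train_dict = {}
    test_dict = {}
    for cl in classes:
        paths = list(files[cl])
        rng.shuffle(paths)
        train_cnt = int(ratio * len(paths))
        train_dict[cl] = paths[:train_cnt]
        test_dict[cl] = paths[train_cnt:]
    return train_dict, test_dict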
In [4]:
train_path, test_path = train_test_split(train_test_ratio, class_titles, files)
In [5]:
pattern = re.compile(r'([a-zA-Z]+|[0-9]+(\.[0-9]+)?)')
def cleanupText(path):
    """
    this method will read in a text file and try to clean up its text.

    Parameters
    ----------
    path: str
        path to the document file

    Returns
    -------
    text_translated: str
        cleaned-up version of the raw text in the input file
    """
    from string import punctuation, digits
    text_translated = ''
    # a context manager closes the file even if reading fails
    # (the original try/finally referenced f before it was guaranteed to exist)
    with open(path) as f:
        raw = f.read().lower()
        text = pattern.sub(r' \1 ', raw.replace('\n', ' '))
        text_translated = text.translate(None, punctuation + digits)
        text_translated = ' '.join([word for word in text_translated.split(' ')
                                    if (word and len(word) > 1)])
    return text_translated
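To make the cleanup concrete, here is what those steps do to a small made-up string (the sample text is purely illustrative):

from string import punctuation, digits
sample = 'Re: 3.5" disks cost $12.99!\ncall 555-0100.'
spaced = pattern.sub(r' \1 ', sample.lower().replace('\n', ' '))   # pad tokens with spaces
stripped = spaced.translate(None, punctuation + digits)            # drop punctuation and digits
print ' '.join(w for w in stripped.split(' ') if w and len(w) > 1)
# prints: re disks cost call   (numbers, punctuation, and 1-letter tokens are gone)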
In [6]:
train_arr = []
test_arr = []
train_lbl = []
test_lbl = []
for cl in class_titles:
    for path in train_path[cl]:
        train_arr.append(cleanupText(path))
        train_lbl.append(cl)
    for path in test_path[cl]:
        test_arr.append(cleanupText(path))
        test_lbl.append(cl)
print len(train_arr)
print len(test_arr)
In [7]:
vectorizer = CountVectorizer()
vectorizer.fit(train_arr)
train_mat = vectorizer.transform(train_arr)
print train_mat.shape
test_mat = vectorizer.transform(test_arr)
print test_mat.shape
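Each column of these matrices corresponds to one vocabulary term. A quick way to spot-check what survived the cleanup (get_feature_names() is the accessor in the scikit-learn versions this notebook targets; 'computer' is just an illustrative token):

print len(vectorizer.vocabulary_)               # vocabulary size == number of columns
print vectorizer.get_feature_names()[:10]       # first few terms, alphabetically
print vectorizer.vocabulary_.get('computer')    # column index of a term (None if absent)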
In [8]:
tfidf = TfidfTransformer()
tfidf.fit(train_mat)
train_tfmat = tfidf.transform(train_mat)
print train_tfmat.shape
test_tfmat = tfidf.transform(test_mat)
print test_tfmat.shape
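With the default settings (smooth_idf=True, norm='l2'), TfidfTransformer computes a smoothed idf, idf(t) = ln((1 + n) / (1 + df(t))) + 1, and then L2-normalizes each row. A minimal numeric check of the idf part against the fitted transformer (this assumes the installed scikit-learn exposes the fitted vector as tfidf.idf_):

n = train_mat.shape[0]                           # number of training documents
df = np.bincount(train_mat.nonzero()[1],         # documents containing each term
                 minlength=train_mat.shape[1])
idf_manual = np.log((1.0 + n) / (1.0 + df)) + 1.0
print np.allclose(idf_manual, tfidf.idf_)        # expected: True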
In [9]:
def testClassifier(x_train, y_train, x_test, y_test, clf):
    """
    this method will first train the classifier on the training data
    and will then test the trained classifier on the test data.
    Finally it will report some metrics on the classifier's performance.

    Parameters
    ----------
    x_train: np.ndarray
        train data matrix
    y_train: list
        train data labels
    x_test: np.ndarray
        test data matrix
    y_test: list
        test data labels
    clf: sklearn classifier object implementing fit() and predict() methods

    Returns
    -------
    metrics: list
        [training time, testing time, precision and recall for each class, macro-averaged F1 score]
    """
    metrics = []
    start = dt.now()
    clf.fit(x_train, y_train)
    end = dt.now()
    print 'training time: ', (end - start)
    # add training time to metrics
    metrics.append(end - start)

    start = dt.now()
    yhat = clf.predict(x_test)
    end = dt.now()
    print 'testing time: ', (end - start)
    # add testing time to metrics
    metrics.append(end - start)

    print 'classification report: '
    # print classification_report(y_test, yhat)
    pp(classification_report(y_test, yhat))
    print 'f1 score'
    print f1_score(y_test, yhat, average='macro')
    print 'accuracy score'
    print accuracy_score(y_test, yhat)

    precision = precision_score(y_test, yhat, average=None)
    recall = recall_score(y_test, yhat, average=None)
    # add per-class precision and recall values to metrics
    for p, r in zip(precision, recall):
        metrics.append(p)
        metrics.append(r)
    # add macro-averaged F1 score to metrics
    metrics.append(f1_score(y_test, yhat, average='macro'))

    print 'confusion matrix:'
    print confusion_matrix(y_test, yhat)
    # plot the confusion matrix as a heatmap
    plt.imshow(confusion_matrix(y_test, yhat), interpolation='nearest')
    plt.show()
    return metrics
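The imshow call above renders the confusion matrix without class labels. A purely cosmetic variant, sketched here as a helper one could call instead (the function name is ours, not part of the pipeline above):

def plotConfusion(cm, labels):
    # annotate the confusion-matrix heatmap with class names and a colorbar
    plt.imshow(cm, interpolation='nearest')
    plt.colorbar()
    plt.xticks(range(len(labels)), labels, rotation=45)
    plt.yticks(range(len(labels)), labels)
    plt.xlabel('predicted')
    plt.ylabel('true')
    plt.show()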
In [10]:
metrics_dict = []
# each entry is a dict of the form {'name': ..., 'metrics': ...}
In [11]:
bnb = BernoulliNB()
bnb_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, bnb)
metrics_dict.append({'name':'BernoulliNB', 'metrics':bnb_me})
In [12]:
gnb = GaussianNB()
gnb_me = testClassifier(train_tfmat.toarray(), train_lbl, test_tfmat.toarray(), test_lbl, gnb)
metrics_dict.append({'name':'GaussianNB', 'metrics':gnb_me})
In [13]:
mnb = MultinomialNB()
mnb_me = testClassifier(train_tfmat.toarray(), train_lbl, test_tfmat.toarray(), test_lbl, mnb)
metrics_dict.append({'name':'MultinomialNB', 'metrics':mnb_me})
In [14]:
# for nn in [5, 10, 15]:
for nn in [5]:
    print 'knn with ', nn, ' neighbors'
    knn = KNeighborsClassifier(n_neighbors=nn)
    knn_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, knn)
    # derive the name from nn so the label stays correct if more values are tried
    metrics_dict.append({'name': str(nn) + 'NN', 'metrics': knn_me})
    print ' '
In [15]:
lsvm = LinearSVC()
lsvm_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, lsvm)
metrics_dict.append({'name':'LinearSVM', 'metrics':lsvm_me})
In [16]:
nusvm = NuSVC()
nusvm_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, nusvm)
metrics_dict.append({'name':'nuSVM', 'metrics':nusvm_me})
In [17]:
rbfsvm = SVC()
rbfsvm_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, rbfsvm)
metrics_dict.append({'name':'SVM with RBF kernel', 'metrics':rbfsvm_me})
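SVC defaults to the RBF kernel, with C and gamma left at their default values here. A sketch of how the same grid-search pattern used below could cover them (the grid values are illustrative, not tuned):

rbfsvm_params = {'C': [1, 10, 100], 'gamma': [0.01, 0.1, 1.0]}
rbfsvm_clf = GridSearchCV(SVC(), rbfsvm_params, cv=5)
# rbfsvm_clf.fit(train_tfmat, train_lbl) would pick the best (C, gamma) pair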
alpha : float, optional (default=1.0)
    Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
binarize : float or None, optional
    Threshold for binarizing (mapping to booleans) of sample features.
    If None, input is presumed to already consist of binary vectors.
fit_prior : boolean
    Whether to learn class prior probabilities or not.
    If false, a uniform prior will be used.
class_prior : array-like, size=[n_classes,]
    Prior probabilities of the classes. If specified the priors are not
    adjusted according to the data.

Note: since the classes are balanced, their priors are equal.
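Following the note above, the uniform prior need not be learned from the data at all; a minimal sketch of pinning it explicitly (four balanced classes, as in this dataset):

bnb_uniform = BernoulliNB(fit_prior=False)           # force a uniform prior
bnb_explicit = BernoulliNB(class_prior=[0.25] * 4)   # equivalent here: state the priors outright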
In [21]:
bnb_params = {'alpha': [a*0.1 for a in range(0,11)]}
bnb_clf = GridSearchCV(BernoulliNB(), bnb_params, cv=10)
bnb_clf.fit(train_tfmat, train_lbl)
print 'best parameters'
print bnb_clf.best_params_
best_bnb = BernoulliNB(alpha=bnb_clf.best_params_['alpha'])
best_bnb_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, best_bnb)
metrics_dict.append({'name':'Best BernoulliNB', 'metrics':best_bnb_me})
In [22]:
best_gnb = GaussianNB()
best_gnb_me = testClassifier(train_tfmat.toarray(), train_lbl, test_tfmat.toarray(), test_lbl, best_gnb)
metrics_dict.append({'name':'Best GaussianNB', 'metrics':best_gnb_me})
alpha : float, optional (default=1.0)
    Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
fit_prior : boolean
    Whether to learn class prior probabilities or not.
    If false, a uniform prior will be used.
class_prior : array-like, size (n_classes,)
    Prior probabilities of the classes. If specified the priors are not
    adjusted according to the data.
In [25]:
mbn_params = {'alpha': [a*0.1 for a in range(0,11)]}
mbn_clf = GridSearchCV(MultinomialNB(), mbn_params, cv=10)
mbn_clf.fit(train_tfmat, train_lbl)
print 'best parameters'
print mbn_clf.best_params_
best_mbn = MultinomialNB(alpha=mbn_clf.best_params_['alpha'])
best_mbn_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, best_mbn)
metrics_dict.append({'name':'Best MultinomialNB', 'metrics':best_mbn_me})
n_neighbors : int, optional (default = 5)
    Number of neighbors to use by default for :meth:`k_neighbors` queries.
weights : str or callable
    weight function used in prediction. Possible values:
    - 'uniform' : uniform weights. All points in each neighborhood
      are weighted equally.
    - 'distance' : weight points by the inverse of their distance.
      In this case, closer neighbors of a query point will have a
      greater influence than neighbors which are further away.
    - [callable] : a user-defined function which accepts an
      array of distances, and returns an array of the same shape
      containing the weights.
    Uniform weights are used by default.
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
    Algorithm used to compute the nearest neighbors:
    - 'ball_tree' will use :class:`BallTree`
    - 'kd_tree' will use :class:`KDTree`
    - 'brute' will use a brute-force search.
    - 'auto' will attempt to decide the most appropriate algorithm
      based on the values passed to the :meth:`fit` method.
    Note: fitting on sparse input will override the setting of
    this parameter, using brute force.
leaf_size : int, optional (default = 30)
    Leaf size passed to BallTree or KDTree. This can affect the
    speed of the construction and query, as well as the memory
    required to store the tree. The optimal value depends on the
    nature of the problem.
metric : string or DistanceMetric object (default='minkowski')
    the distance metric to use for the tree. The default metric is
    minkowski, and with p=2 is equivalent to the standard Euclidean
    metric. See the documentation of the DistanceMetric class for a
    list of available metrics.
p : integer, optional (default = 2)
    Power parameter for the Minkowski metric. When p = 1, this is
    equivalent to using manhattan_distance (l1), and euclidean_distance
    (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
In [28]:
knn_params = {'n_neighbors': range(1,21), 'weights': ['uniform', 'distance'], 'algorithm': ['ball_tree', 'kd_tree'],
'leaf_size': [15, 30, 50, 100], 'p': [1,2]}
knn_clf = GridSearchCV(KNeighborsClassifier(), knn_params, cv=10)
knn_clf.fit(train_tfmat, train_lbl)
print 'best parameters'
print knn_clf.best_params_
best_knn = KNeighborsClassifier(n_neighbors=knn_clf.best_params_['n_neighbors'], weights=knn_clf.best_params_['weights'],
                                algorithm=knn_clf.best_params_['algorithm'], leaf_size=knn_clf.best_params_['leaf_size'],
                                p=knn_clf.best_params_['p'])
best_knn_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, best_knn)
metrics_dict.append({'name':'Best KNN', 'metrics':best_knn_me})
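Note that, per the algorithm description above, fitting on sparse input overrides the algorithm setting and falls back to brute force, so the 'algorithm' and 'leaf_size' axes of this grid multiply the number of candidates without changing the model on the sparse train_tfmat. A trimmed sketch of the search under that assumption:

knn_params_sparse = {'n_neighbors': range(1, 21), 'weights': ['uniform', 'distance'], 'p': [1, 2]}
knn_clf_sparse = GridSearchCV(KNeighborsClassifier(algorithm='brute'), knn_params_sparse, cv=10)
# knn_clf_sparse.fit(train_tfmat, train_lbl) searches 80 candidates instead of 1280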
C : float, optional (default=1.0)
    Penalty parameter C of the error term.
loss : string, 'l1' or 'l2' (default='l2')
    Specifies the loss function. 'l1' is the hinge loss (standard SVM)
    while 'l2' is the squared hinge loss.
penalty : string, 'l1' or 'l2' (default='l2')
    Specifies the norm used in the penalization. The 'l2'
    penalty is the standard used in SVC. The 'l1' leads to `coef_`
    vectors that are sparse.
dual : bool, (default=True)
    Select the algorithm to either solve the dual or primal
    optimization problem. Prefer dual=False when n_samples > n_features.
tol : float, optional (default=1e-4)
    Tolerance for stopping criteria.
multi_class : string, 'ovr' or 'crammer_singer' (default='ovr')
    Determines the multi-class strategy if `y` contains more than
    two classes. `ovr` trains n_classes one-vs-rest classifiers, while
    `crammer_singer` optimizes a joint objective over all classes.
    While `crammer_singer` is interesting from a theoretical perspective
    because it is consistent, it is seldom used in practice, rarely leads
    to better accuracy, and is more expensive to compute.
    If `crammer_singer` is chosen, the options loss, penalty and dual will
    be ignored.
fit_intercept : boolean, optional (default=True)
    Whether to calculate the intercept for this model. If set
    to false, no intercept will be used in calculations
    (e.g. the data is expected to be already centered).
intercept_scaling : float, optional (default=1)
    When self.fit_intercept is True, the instance vector x becomes
    [x, self.intercept_scaling], i.e. a "synthetic" feature with constant
    value equal to intercept_scaling is appended to the instance vector.
    The intercept becomes intercept_scaling * synthetic feature weight.
    Note: the synthetic feature weight is subject to l1/l2 regularization
    like all other features. To lessen the effect of regularization on the
    synthetic feature weight (and therefore on the intercept),
    intercept_scaling has to be increased.
class_weight : {dict, 'auto'}, optional
    Set the parameter C of class i to class_weight[i]*C for
    SVC. If not given, all classes are supposed to have
    weight one. The 'auto' mode uses the values of y to
    automatically adjust weights inversely proportional to
    class frequencies.
verbose : int, default: 0
    Enable verbose output. Note that this setting takes advantage of a
    per-process runtime setting in liblinear that, if enabled, may not work
    properly in a multithreaded context.
random_state : int seed, RandomState instance, or None (default)
    The seed of the pseudo random number generator to use when
    shuffling the data.
In [29]:
lsvm_params = {'C':[1,10,100,1000], 'loss':['l1', 'l2']}
lsvm_clf = GridSearchCV(LinearSVC(), lsvm_params, cv=5)
lsvm_clf.fit(train_tfmat, train_lbl)
print 'best parameters'
print lsvm_clf.best_params_
best_lsvm = LinearSVC(C=lsvm_clf.best_params_['C'], loss=lsvm_clf.best_params_['loss'])
best_lsvm_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, best_lsvm)
metrics_dict.append({'name':'Best Linear SVM', 'metrics':best_lsvm_me})
In [31]:
metrics_table = []
metrics_table.append(['', 'name', 'training time', 'testing time',
'p_1', 'r_1',
'p_2', 'r_2',
'p_3', 'r_3',
'p_4', 'r_4',
'macro-averaged F1 score'
])
i = 0
for me in metrics_dict:
    i += 1
    metric = []
    metric.append(i)
    metric.append(me['name'])
    for m in me['metrics']:
        metric.append(m)
    metrics_table.append(metric)
make_table(metrics_table)
# styling
apply_theme('basic_both')
set_column_style(12, align='center')
Out[31]: